1 Introduction

Figures for Marta’s paper.

library(rhdf5)
library(dplyr)
library(Seurat)
library(scales)
library(ggplot2)
suppressPackageStartupMessages(library(ComplexHeatmap))

plot_ps <- function(adata, by='dpt_pseudotime') {
  f_df <- as.data.frame(adata@reductions$umap@cell.embeddings) 
  f_df$color <- adata@meta.data[[by]]
  
  f_plot <- ggplot(f_df, aes(x=UMAP_1, y=UMAP_2, color=color)) +
    geom_point() + scale_color_continuous(type = "viridis")
  # scale_color_gradient(low="blue", high="yellow")
  
  return (f_plot)
}

2 Load dataset

counts <- as.data.frame(h5read('../data/processed/04_dataset.h5ad', 'raw/X'))
colnames(counts) <- h5read('../data/processed/04_dataset.h5ad', '/obs/Well_ID')
rownames(counts) <- h5read('../data/processed/04_dataset.h5ad', '/var/_index')

metadata <- read.csv('../data/processed/04_dataset-metadata.csv')
rownames(metadata) <- metadata$Well_ID

subset <- CreateSeuratObject(counts, meta.data = metadata)

subset@assays$RNA@scale.data <- h5read('../data/processed/04_dataset.h5ad', '/X')
# colnames(subset@assays$RNA@scale.data) <- h5read('../data/processed/04_dataset.h5ad', '/obs/Well_ID')
# rownames(subset@assays$RNA@scale.data) <- h5read('../data/processed/04_dataset.h5ad', '/var/_index')

subset@assays$RNA@scale.data <- as.matrix(counts)
subset@meta.data$louvain <- as.factor(subset@meta.data$louvain)

umap_coor <- t(h5read('../data/processed/04_dataset.h5ad', '/obsm/X_umap'))
rownames(umap_coor) <- h5read('../data/processed/04_dataset.h5ad', '/obs/Well_ID')
colnames(umap_coor) <- c('UMAP_1', 'UMAP_2')

subset@reductions$umap <- Seurat::CreateDimReducObject(
  embeddings = umap_coor,
  assay = 'RNA',
  key = 'UMAP_'
)

cc <- readRDS('../data/mouse_cell_cycle_genes.rds')
stage_colors <- c('2iLIF_PrE' = '#8E9090',
                  # 'D0_PrE' = '#97918C',
                  'D1' = '#11839A',
                  'D2' = '#8E8E0D',
                  'D3' = '#E4B32E',
                  'D4' = '#C4651C',
                  'D7' = '#BD4306'
                )
seurat_colors <- c('0' = '#0DA257',
                   '1' = '#A6856D',
                   '2' = '#0096BF',
                   '3' = '#C28153',
                   '4' = '#A6363A',
                   '5' = '#4F9996',
                   '6' = '#8C8177',
                   '7' = '#E89E85',
                   '8' = '#B6B44E'
                  )
cc_colors <- c('#FAA198', '#BDB76B', '#FFE4C4')
subset <- CellCycleScoring(subset, s.features = cc$s.genes, g2m.features = cc$g2m.genes)
DimPlot(subset, group.by = 'louvain', cols = as.vector(seurat_colors))

DimPlot(subset, group.by = 'Stage', cols = as.vector(stage_colors))

3 Counts of cells per cluster

subset@meta.data %>% 
  group_by(Stage) %>% 
  summarize('counts' = n())
# Undiff
subset@meta.data %>% 
  filter(louvain %in% c('0', '2')) %>%
  group_by(Stage) %>% 
  summarize('counts' = n())
# Diff
subset@meta.data %>% 
  filter(louvain %in% c('3', '4', '7')) %>%
  group_by(Stage) %>% 
  summarize('counts' = n())

4 Normalize cell size

Estimating mean and median from 2iLIF. Only take into account 2iLIF which were loaded with other Stages. Plate 1 which is only 2iLIF is ignored.

full_meta <- read.csv('../data/processed/04_dataset-metadata.csv')
tmp <- full_meta[(full_meta$plate_ID != '1' & full_meta$Stage == '2iLIF_PrE'), 'P3.FSC.A.Geo.Mean']

cell_mean <- mean(tmp, na.rm=T)
cell_median <- median(tmp, na.rm=T)
subset@meta.data$cs_mean <- subset$P3.FSC.A.Geo.Mean / cell_mean
subset@meta.data$cs_median <- subset$P3.FSC.A.Geo.Mean / cell_median

5 Specify trajectory paths

subset@meta.data['Status'] <- c()
subset@meta.data[subset@meta.data$louvain %in% c('1', '6', '5'), 'Status'] <- '2iLIF'
subset@meta.data[subset@meta.data$louvain %in% c('3', '4', '7'), 'Status'] <- 'PrEDiff'
subset@meta.data[subset@meta.data$louvain %in% c('0', '2'), 'Status'] <- 'NEDiff'
DimPlot(subset, group.by = 'Stage', cols = stage_colors) + theme_classic(base_size = 12)

DimPlot(subset, group.by = 'louvain', cols = seurat_colors) + theme_classic(base_size = 12)

DimPlot(subset, group.by = 'Phase', cols = cc_colors) + theme_classic(base_size = 12)

plot_ps(subset) + theme_classic(base_size = 12)

5.1 Bonus plot

tmp <- subset
Idents(tmp) <- tmp$louvain

commonR::plot_proportion_bar(tmp, group.by = 'Stage', order = c('1', '6', '5', '0', '2', '3', '7', '4')) + 
  ggtitle('Stage proportion per per cluster') +
  scale_fill_manual(values = stage_colors) +
  theme_classic(base_size = 12)

p1 <- commonR::plot_proportion_bar(tmp[, tmp$Status == '2iLIF'], group.by = 'Stage') + 
  facet_grid(cols = vars('2iLIF')) +
  scale_fill_manual(values = stage_colors) +
  theme(legend.position="none")

p2 <- commonR::plot_proportion_bar(tmp[, tmp$Status == 'NEDiff'], group.by = 'Stage') + 
  facet_grid(cols = vars('NEDiff')) +
  scale_fill_manual(values = stage_colors) +
  theme(legend.position="none")

p3 <- commonR::plot_proportion_bar(tmp[, tmp$Status == 'PrEDiff'], group.by = 'Stage',
                                   order = c('3', '7', '4')) + 
  facet_grid(cols = vars('PrEDiff')) +
  scale_fill_manual(values = as.vector(stage_colors), breaks=names(stage_colors), drop=F)
  
p1 + p2 + p3

6 Figure #1

Heatmap of endoderm markers markers visualized in D0, D2 and D7. Should be progressive

genes <- c("Leo1", "Rtf1", "Tnrc6c", "Fn1", "Nanog", "Sox2", "Nodal", "Cdc73", "Ctr9",
  "Ctnnb1", "Itgav", "Paf1", "Pou5f1", "Smad2", "Dusp5", "Hnf1b", "Sox7", "Eomes", "Dusp1", "Hsbp1", "Itga5",
  "Gata4", "Setd2", "Sox17", "Dkk1", "Dusp4", "Gata6")

tmp <- subset[, subset$louvain %in% c('1', '3', '7', '4') & subset$Stage %in% c('2iLIF_PrE', 'D2', 'D7')]
# DoHeatmap(tmp, features = genes, group.by = 'Stage', angle = 15, group.colors = stage_colors) +
DoHeatmap(tmp, features = genes, group.by = 'Stage', angle = 15, group.colors = as.vector(stage_colors[c('2iLIF_PrE', 'D2', 'D7')])) +
  scale_fill_gradientn(colors = c("blue", "white", "red"))

# dittoHeatmap(tmp, genes, order.by = "Stage", annot.by = c("Stage"))
mat <- GetAssayData(tmp[genes, ]) %>% as.matrix()
cluster_anno <- tmp[genes, ]$Stage

col_fun = circlize::colorRamp2(c(-2, 0, 2), c("blue", "white", "red"))

Heatmap(mat, 
        name = 'Expression',
        column_split = factor(cluster_anno),
        show_column_names = F,
        show_column_dend = F,
        show_row_dend = F,
        cluster_rows = T,
        cluster_columns = F,
        col = col_fun,
        top_annotation = HeatmapAnnotation(foo = anno_block(gp = gpar(
          fill = c('#948170', '#379F4C', '#2096B3', '#F2A082', '#CE7E41', '#B22222')))
        )
)

DoHeatmap(subset, features = genes, group.by = 'louvain', angle = 15, group.colors = seurat_colors) +
  scale_fill_gradientn(colors = c("blue", "white", "red"))

mat <- GetAssayData(subset[genes, ]) %>% as.matrix()
cluster_anno <- factor(subset[genes, ]$louvain, levels = c('1', '6', '5', '0', '2', '3', '7', '4'))
col_fun = circlize::colorRamp2(c(-2, 0, 2), c("blue", "white", "red"))

# Heatmap(mat, 
#         name = 'Expression',
#         column_split = factor(cluster_anno),
#         show_column_names = F,
#         show_column_dend = F,
#         show_row_dend = F,
#         cluster_rows = T,
#         cluster_columns = F,
#         col = col_fun,
#         top_annotation = HeatmapAnnotation(foo = anno_block(gp = gpar(
#           fill = c('#948170', '#379F4C', '#2096B3', '#F2A082', '#CE7E41', '#B22222')))
#         )
# )

7 Figure #2

Bar plot of cell cycle from 2i to D7.

tmp <- subset[, subset$louvain %in% c('1', '6', '5', '3', '7', '4')]

Idents(tmp) <- tmp$Stage
commonR::plot_proportion_bar(tmp, group.by = 'Phase') + 
  scale_fill_manual(values=cc_colors) +
  theme_classic(base_size = 12) +
  ggtitle('Cell cycle progression in PrEDiff trajectory')

nowotschin <- readRDS("../data/processed/nowotschin.rds")
nowotschin[['Celltype_combined']] <- paste(nowotschin$Timepoint, nowotschin$CellType, sep = '-')
nowotschin <- nowotschin[, nowotschin$CellType %in% c('ICM', 'EPI', 'PrE', 'VE')]

Idents(nowotschin) <- nowotschin$Celltype_combined
commonR::plot_proportion_bar(nowotschin, group.by = 'Phase') + 
  scale_fill_manual(values=cc_colors) +
  theme_classic(base_size = 12) +
  ggtitle('Cell cycle progression trajectory')

nowotschin_sub <-  nowotschin[, nowotschin$Celltype_combined %in% c(
  "E3.5-ICM",
  "E3.5-PrE",
  "E4.5-PrE",
  "E5.5-VE"
  )]
commonR::plot_proportion_bar(nowotschin_sub, group.by = 'Phase') + 
  scale_fill_manual(values=cc_colors) +
  theme_classic(base_size = 12) +
  ggtitle('Cell cycle progression trajectory')

8 Figure #3

8.1 Hhex expression from FACS

plot_ps(subset, by='P3.mCherry.561D.A.Geo.Mean') + 
  theme_classic(base_size = 12) +
  ggtitle('FACS Hhex expression')

df <- subset@meta.data %>% filter(louvain %in% c('1', '6', '5'))
p1 <- ggplot(df, aes(x=louvain, y=P3.mCherry.561D.A.Geo.Mean, fill=louvain)) + 
  geom_boxplot() + 
  geom_jitter(width=0.1,alpha=0.2) +
  labs(fill = 'Clusters') + xlab("Clusters") + ylab("FCS Hhex expression") +
  coord_cartesian(ylim = c(0, 4000)) +
  scale_fill_manual("legend", values = seurat_colors) +
  theme(legend.position="none") +
  facet_grid(cols = vars('2iLIF_PrE'))

df <- subset@meta.data %>% filter(louvain %in% c('0', '2'))
df$louvain <- factor(df$louvain, levels = c('0', '2'))
p2 <- ggplot(df, aes(x=louvain, y=P3.mCherry.561D.A.Geo.Mean, fill=louvain)) + 
  geom_boxplot() + 
  geom_jitter(width=0.1,alpha=0.2) +
  labs(fill = 'Clusters') + xlab("Clusters") + ylab("FCS Hhex expression") +
  coord_cartesian(ylim = c(0, 4000)) +
  scale_fill_manual("legend", values = seurat_colors) +
  theme(legend.position="none") +
  facet_grid(cols = vars('NEDiff'))

df <- subset@meta.data %>% filter(louvain %in% c('3', '7', '4'))
df$louvain <- factor(df$louvain, levels = c('3', '7', '4'))
p3 <- ggplot(df, aes(x=louvain, y=P3.mCherry.561D.A.Geo.Mean, fill=louvain)) + 
  geom_boxplot() + 
  geom_jitter(width=0.1,alpha=0.2) +
  labs(fill = 'Clusters') + xlab("Clusters") + ylab("FCS Hhex expression") +
  coord_cartesian(ylim = c(0, 4000)) +
  # scale_fill_manual("legend", values = seurat_colors) +
  theme(legend.position="none") +
  facet_grid(cols = vars('PrEDiff'))

p1 + p2 + p3

df <- subset@meta.data
df <- df[df$louvain %in% c('1', '6', '5', '0', '2', '3', '7', '4'), ]
df$louvain <- factor(df$louvain, levels = c('1', '6', '5', '0', '2', '3', '7', '4'))
ggplot(df, aes(x=louvain, y=P3.mCherry.561D.A.Geo.Mean, fill=louvain)) + 
  geom_boxplot() + 
  geom_jitter(width=0.1,alpha=0.2) +
  scale_fill_manual("legend", values = seurat_colors) +
  labs(fill = 'Clusters') + xlab("Clusters") + ylab("FCS Hhex expression") +
  theme_classic(base_size = 12)

8.2 Sox2 expression from FACS

plot_ps(subset, by='P3.GFP.A.Geo.Mean') + 
  theme_classic(base_size = 12) +
  ggtitle('FACS Sox2 expression')

df <- subset@meta.data %>% filter(louvain %in% c('1', '6', '5'))
p1 <- ggplot(df, aes(x=louvain, y=P3.GFP.A.Geo.Mean, fill=louvain)) + 
  geom_boxplot() + 
  geom_jitter(width=0.1,alpha=0.2) +
  labs(fill = 'Clusters') + xlab("Clusters") + ylab("FCS Sox2 expression") +
  coord_cartesian(ylim = c(0, 4000)) +
  scale_fill_manual("legend", values = seurat_colors) +
  theme(legend.position="none") +
  facet_grid(cols = vars('2iLIF_PrE'))

df <- subset@meta.data %>% filter(louvain %in% c('0', '2'))
df$louvain <- factor(df$louvain, levels = c('0', '2'))
p2 <- ggplot(df, aes(x=louvain, y=P3.GFP.A.Geo.Mean, fill=louvain)) + 
  geom_boxplot() + 
  geom_jitter(width=0.1,alpha=0.2) +
  labs(fill = 'Clusters') + xlab("Clusters") + ylab("FCS Sox2 expression") +
  coord_cartesian(ylim = c(0, 4000)) +
  scale_fill_manual("legend", values = seurat_colors) +
  theme(legend.position="none") +
  facet_grid(cols = vars('NEDiff'))

df <- subset@meta.data %>% filter(louvain %in% c('3', '7', '4'))
df$louvain <- factor(df$louvain, levels = c('3', '7', '4'))
p3 <- ggplot(df, aes(x=louvain, y=P3.GFP.A.Geo.Mean, fill=louvain)) + 
  geom_boxplot() + 
  geom_jitter(width=0.1,alpha=0.2) +
  labs(fill = 'Clusters') + xlab("Clusters") + ylab("FCS Sox2 expression") +
  coord_cartesian(ylim = c(0, 4000)) +
  # scale_fill_manual("legend", values = seurat_colors) +
  theme(legend.position="none") +
  facet_grid(cols = vars('PrEDiff'))

p1 + p2 + p3

9 Figure #4

9.1 Cell size

Problem: D0 in from pseudotime is later then D1. I can redo the pseudotime by removing 2iLIF and setting D0 as a starting point. Otherwise I can start just ignore the D0.

df <- subset@meta.data %>%
  filter(louvain %in% c('1', '6', '5', '3', '7', '4')) %>%
  dplyr::select(louvain, cs_mean, cs_median, dpt_pseudotime, Stage) %>%
  dplyr::filter(!is.na(cs_mean)) %>%
  dplyr::arrange(dpt_pseudotime)

ggplot() + 
  geom_point(data=df, aes(x=dpt_pseudotime, y=cs_mean, color=Stage)) +
  geom_smooth(data=df, aes(x=dpt_pseudotime, y=cs_mean)) +
  ggtitle('Mean cell size normalized by 2iLIF') +
  scale_color_manual("legend", values = stage_colors) +
  ylim(0, 2)

ggplot() + 
  geom_point(data=df, aes(x=dpt_pseudotime, y=cs_median, color=Stage)) +
  geom_smooth(data=df, aes(x=dpt_pseudotime, y=cs_median)) +
  scale_color_manual("legend", values = stage_colors) +
  ggtitle('Median cell size normalized by 2iLIF') +
  ylim(0, 2)

10 Figure #5 - Signaling pathways from CAT

Figures can be found in ../reports/

11 Figure #6 - vivo vs vitro

Idents(subset) <- subset$louvain
vitro_markers <- FindAllMarkers(subset, only.pos = T)
vitro_markers_filt <- vitro_markers %>%
    group_by(cluster) %>%
    filter(p_val_adj < 0.05 & avg_log2FC > 0.5) %>%
    slice_max(n = 20, order_by = avg_log2FC)
vitro_markers_filt

Source code stolen from Jose with his blessing.

library(GeneOverlap)
library(reshape2)
library(ggplot2)

marker_database <- read.delim('../data/marker_database.tsv', stringsAsFactors=FALSE)
marker_database$genes <- strsplit(tolower(marker_database$genes), split = ",")

database_lists <- marker_database$genes
names(database_lists) <- paste(marker_database$publication, "|", marker_database$cluster)
database_lists <- database_lists[grepl("Nowotschin", names(database_lists))]

data(GeneOverlap)
genomic_size <- gs.RNASeq

my_list <- vitro_markers_filt[c("cluster", "gene")]
my_list <- sapply(as.vector(unique(my_list$cluster)), FUN = function(x, gene_list){
  return(my_list %>% filter(cluster == x) %>% pull(gene) %>% tolower)
}, my_list, USE.NAMES = TRUE, simplify = FALSE)

# run GOM
gom.obj <- newGOM(gsetA = my_list, gsetB = database_lists, genome.size = genomic_size)

# extract data
pvals <- getMatrix(gom.obj, name = "pval")
odds <- getMatrix(gom.obj, name = "odds.ratio")
log_odds <- log2(odds)

11.1 Heatmap based on p-value

# heatmap based on p-values
melt_pvals <- melt(pvals)
melt_pvals$value <- p.adjust(melt_pvals$value, method = "bonferroni")
colnames(melt_pvals) <- c("Louvain", "Database_lists", "value")

melt_pvals$Study <- unlist(lapply(strsplit(as.character(melt_pvals$Database_lists), split =  "|", fixed = T), "[", 1))
melt_pvals$Study <- trimws(melt_pvals$Study)

melt_pvals$cell_type <- unlist(lapply(strsplit(as.character(melt_pvals$Database_lists), split =  "|", fixed = T), "[", 2))
melt_pvals$cell_type <- trimws(melt_pvals$cell_type)

melt_pvals <- melt_pvals[melt_pvals$Louvain %in% c('1', '6', '5', '0', '2', '3', '7', '4'), ]
melt_pvals$Louvain <- factor(melt_pvals$Louvain, levels = c('1', '6', '5', '0', '2', '3', '7', '4'))
ggplot(melt_pvals) + 
  geom_tile(aes(x = Louvain, y = cell_type, fill = -log10(value)), color = "white") + 
  #coord_equal() + # Makes the heatmap tiles square, not compatible with facet_grid
  labs(x = "Louvain", y = "Database gene lists", fill = "-log10(p-value)") +
  scale_fill_distiller(palette = "Spectral") +
  facet_grid(Study~., scales = "free", space = "free_y") +
  theme(axis.text.x = element_text(angle = 90), strip.text.y = element_text(angle = 0)) 

melt_logodds <- melt(log_odds)
melt_logodds <- melt_logodds[melt_logodds$Var1 %in% c('1', '6', '5', '0', '2', '3', '7', '4'), ]

colnames(melt_logodds) <- c("PC_lists","Database_lists","value")

melt_logodds$Study <- unlist(lapply(strsplit(as.character(melt_logodds$Database_lists), split =  "|", fixed = T), "[", 1))
melt_logodds$Study <- trimws(melt_logodds$Study)

melt_logodds$cell_type <- unlist(lapply(strsplit(as.character(melt_logodds$Database_lists), split =  "|", fixed = T), "[", 2))
melt_logodds$cell_type <- trimws(melt_logodds$cell_type)

melt_pvals$odds <- melt_logodds$value

melt_pvals$odds[!is.finite(melt_pvals$odds)] <- NA
limit <- max(abs(melt_pvals$odds), na.rm = T) * c(-1, 1)
ggplot(melt_pvals) + geom_point(aes(x = factor(Louvain), y = factor(cell_type), color = odds, size = -log10(value) )) + 
  labs(x = "Louvain", y = "Database gene lists", color = "log2(odds ratio)", size = "-log10(p-value)") +
  scale_color_distiller(palette = "RdBu", direction = -1, limit = limit) + scale_size_continuous(range = c(1,5)) +
  facet_grid(Study~., scales = "free", space = "free", drop = T) + theme_bw()

11.2 Genes

genes <- c("Klf2", "Klf4", "Klf5", "Nanog", "Sox2", "Pou5f1", "Otx2", "Pou3f1", 
           "Cdx2", "Eomes", "Fgf5", "Elf5", "Zfp42")
for(gene in genes) {
  if (gene %in% rownames(subset)) {
    p <- FeaturePlot(subset, features= gene, pt.size = 1, cols = c("#FFE4FD","magenta4"))
    print(p)
  }
}